library(tidyverse)
library(statnet)
library(cvTools)
library(furrr)
library(glasso)
library(mvtnorm)
# remotes::install_github('leifeld/xergm')
library(xergm)
library(ndtv)
library(recipes)
library(bestNormalize)
library(changepoint)
theme_set(theme_bw())
data = read_csv('data.csv')
Generate Year and Quarter from Date.
data = data |>
mutate(Year = year(Date),
Year = as_factor(Year),
Quarter = quarter(Date),
Quarter = as_factor(Quarter)) |>
select(Year,Quarter,everything(),-Date)
Check the number of observations for each combination of Year
and Quarter.
data |>
count(Year,Quarter)
# A tibble: 41 × 3
Year Quarter n
<fct> <fct> <int>
1 2014 2 18
2 2014 3 65
3 2014 4 64
4 2015 1 63
5 2015 2 61
6 2015 3 65
7 2015 4 64
8 2016 1 62
9 2016 2 63
10 2016 3 65
# ℹ 31 more rows
Drop observations for the 2nd quarter of 2014 and the 2nd quarter of 2024, since both quarters are only partially observed (2014 Q2 has just 18 observations).
data = data |>
filter(!((Year==2014 & Quarter==2) | (Year==2024 & Quarter==2)))
Scale and normalize each variable except Year and
Quarter.
For each combination of Year and Quarter, find the optimal \(\rho\) from the change point of the metric (square) generated by cross-validation.
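Concretely, the fold-level metric implemented below is the average test-fold log-density under the glasso fit, shrunk by the squared number of estimated edges:
\[
\text{metric}_k = \frac{\frac{1}{n_k}\sum_{i=1}^{n_k}\log\phi\left(x_i;\,0,\,\hat{\Sigma}_\rho\right)}{\left(E_\rho+1\right)^2},
\]
where \(\hat{\Sigma}_\rho\) is the covariance matrix estimated by the graphical lasso on the training folds, \(E_\rho\) is the number of nonzero off-diagonal pairs in the estimated precision matrix, and \(n_k\) is the size of the \(k\)-th test fold. The CV metric for a given \(\rho\) is the average of \(\text{metric}_k\) over the \(K\) folds.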
# Function to normalize variables within each cv-fold
scale_cv_data = function(k, folds, data){
  # Row indices of the k-th test fold
  id_test = folds$subsets[folds$which==k]
  train_data = data |> slice(-id_test)
  test_data = data |> slice(id_test)
  set.seed(1)
  preprocessor = recipe(~., data = train_data) |>
    step_best_normalize(all_numeric_predictors()) |>
    step_normalize(all_numeric_predictors())
  # Prep once on the training fold, then bake both folds so the test fold
  # is transformed with training-fold parameters only
  prepped = prep(preprocessor)
  result = list(scale_train = bake(prepped, new_data = train_data),
                scale_test = bake(prepped, new_data = test_data))
  return(result)
}
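As a quick sanity check (illustrative, not part of the pipeline), the helper can be exercised on a single fold of the full data; after step_normalize, the scaled training fold should have column means near zero and standard deviations near one.
# Illustrative check of scale_cv_data() on one fold
set.seed(1)
demo_folds = cvFolds(n = nrow(data), K = 10)
demo_scaled = scale_cv_data(k = 1,
                            folds = demo_folds,
                            data = data |> select(-Year,-Quarter))
round(colMeans(demo_scaled$scale_train), 3) # approximately 0 for every column
round(apply(demo_scaled$scale_train, 2, sd), 3) # approximately 1 for every column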
# Function to compute the cv-metric (square) for a given rho
lasso_tune = function(rho, train_data_list, test_data_list){
  K = length(train_data_list)
  metric = rep(NA, K)
  for(k in 1:K){
    # Sample covariance of the scaled training fold
    s = cov(train_data_list[[k]])
    p = ncol(s)
    set.seed(1)
    graphical_lasso = glasso(s = s,
                             rho = rho,
                             penalize.diagonal = FALSE)
    # Test-fold log-density under the estimated Gaussian (the preprocessing
    # centres every variable, so the mean is zero)
    sigma = graphical_lasso$w
    log_density = dmvnorm(x = test_data_list[[k]],
                          mean = rep(0,p),
                          sigma = sigma,
                          log = TRUE)
    # Number of edges: nonzero off-diagonal pairs of the precision matrix
    inv_cov = graphical_lasso$wi
    diag(inv_cov) = 0
    num_edge = sum(inv_cov!=0)/2
    # Penalize the average log-density by the squared edge count
    metric[k] = mean(log_density, na.rm = TRUE) / ((num_edge + 1)^2)
  }
  avg_metric = mean(metric)
  return(avg_metric)
}
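Continuing the sketch above, lasso_tune can be evaluated at a single example value of \(\rho\) before mapping it over the whole grid:
# Illustrative single-rho evaluation using the demo folds from above
demo_scaled_list = map(.x = 1:10,
                       .f = scale_cv_data,
                       folds = demo_folds,
                       data = data |> select(-Year,-Quarter))
lasso_tune(rho = 0.1, # example value
           train_data_list = map(demo_scaled_list, ~.x$scale_train),
           test_data_list = map(demo_scaled_list, ~.x$scale_test))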
# Candidate rho values
rho_seq = seq(0.001,1,0.001)
# Combinations of Year and Quarter
combination = data |>
distinct(Year,Quarter) |>
mutate(rho = NA)
# Number of folds in cv
K = 10
# Find optimal rho according to the change point of metric (square) generated by cross validation for each combination of Year and Quarter
for(i in 1:nrow(combination)){
# Data for specific Year and Quarter
temp_data = data |>
filter(Year==combination$Year[i],
Quarter==combination$Quarter[i]) |>
select(-Year,-Quarter)
set.seed(1)
# Create cv-folds for specific Year and Quarter data
temp_folds = cvFolds(n = nrow(temp_data), K = K)
# Normalize variables for each fold
plan(multisession, workers = parallel::detectCores()-1)
temp_scale_data_list = future_map(.x = 1:K,
.f = scale_cv_data,
folds = temp_folds,
data = temp_data,
.progress = TRUE)
temp_scale_train_data = map(.x = temp_scale_data_list,
.f = ~.x$scale_train)
temp_scale_test_data = map(.x = temp_scale_data_list,
.f = ~.x$scale_test)
# Calculate the cv-metric (square) for each rho value on the data for this Year-Quarter combination
plan(multisession, workers = parallel::detectCores()-1)
temp_metric = future_map_dbl(.x = rho_seq,
.f = lasso_tune,
train_data_list = temp_scale_train_data,
test_data_list = temp_scale_test_data,
.progress = TRUE)
# Calculate the first-order and second-order derivatives of the cv-metric corresponding to each rho
temp_derivatives = tibble(rho = rho_seq,
metric = temp_metric) |>
mutate(slope1 = c(NA,diff(metric)) / c(NA,diff(rho)),
slope2 = c(NA,diff(slope1)) / c(NA,diff(rho)))
# Extract the second-order derivatives for rho falling in [0.05, 0.95]
temp_slope2 = temp_derivatives |>
filter(between(rho,0.05,0.95)) |>
select(rho,slope2)
# Find the optimal change point in the variance of the second-order derivatives
set.seed(1)
temp_opt_cpt = cpt.var(data = temp_slope2$slope2, # the second-order derivatives
                       penalty = 'MBIC', # penalty criterion for locating the change point
                       method = 'BinSeg',
                       test.stat = 'Normal', # assumed distribution of the data
                       Q = 1) # maximum number of change points to search for with "BinSeg"
# Store the optimal rho for specific Year and Quarter data
combination$rho[i] = temp_slope2$rho[cpts(temp_opt_cpt)]
}
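As a diagnostic sketch (using objects left in the workspace by the last loop iteration), plot the CV metric curve against \(\rho\) and mark the selected change point for the final Year-Quarter combination:
# Diagnostic sketch: cv-metric versus rho for the last loop iteration,
# with the selected change-point rho marked by a dashed line
tibble(rho = rho_seq, metric = temp_metric) |>
  ggplot(aes(x = rho, y = metric)) +
  geom_line() +
  geom_vline(xintercept = combination$rho[nrow(combination)], linetype = 'dashed') +
  labs(x = expression(rho), y = 'cv-metric (square)')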
Check that the results are reproducible by printing the mean of the selected \(\rho\) values.
mean(combination$rho) # 0.8047179
[1] 0.8047179
Create a network for each combination of Year and
Quarter.
network_data = list()
relation_data = list()
# plots = list()
for(i in 1:nrow(combination)){
# Data for specific Year and Quarter
temp_data = data |>
filter(Year==combination$Year[i],
Quarter==combination$Quarter[i]) |>
select(-Year,-Quarter)
set.seed(1)
# Normalize variables for specific Year and Quarter data
temp_preprocessor = recipe(~., data = temp_data) |>
step_best_normalize(all_numeric_predictors()) |>
step_normalize(all_numeric_predictors())
temp_scale_data = bake(prep(temp_preprocessor),new_data = temp_data)
# Estimate a graphical lasso for specific Year and Quarter data
graphical_lasso = glasso(s = cov(temp_scale_data), rho = combination$rho[i])
# Extract estimated inverse covariance matrix from graphical lasso
inv_sigma = graphical_lasso$wi
diag(inv_sigma) = 0
rownames(inv_sigma) = colnames(temp_scale_data)
colnames(inv_sigma) = colnames(temp_scale_data)
# Generate a network from the estimated inverse covariance matrix; vertex
# names are taken from the dimnames set above
network_data[[i]] = as.network(x = inv_sigma, # nonzero entries define the edges
                               directed = FALSE,
                               matrix.type = 'adjacency') |>
  # set vertex attribute "num_relationship": number of relationships per vertex
  set.vertex.attribute(attrname = 'num_relationship',
                       value = colSums(inv_sigma!=0)) |>
  # set edge attribute "degree_relationship": relationship strength
  set.edge.value(attrname = 'degree_relationship',
                 value = inv_sigma) |>
  # set network attribute "degree_relationship": full relationship strength matrix
  set.network.attribute(attrname = 'degree_relationship',
                        value = inv_sigma)
relation_data[[i]] = inv_sigma
}
names(network_data) = str_c(combination$Year,
combination$Quarter,
sep = '-')
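As a quick check (illustrative), a single quarterly network can be printed to confirm that vertices, edges, and attributes were attached as intended:
# Illustrative inspection of one quarterly network
network_data[['2015-2']] # prints vertex/edge counts and attribute names
head(get.vertex.attribute(network_data[['2015-2']], 'num_relationship'))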
Estimate a TERGM by MPLE with temporal bootstrapping, using the following independent covariates.
set.seed(1)
tergmFit1 = btergm(formula = network_data ~ edges +
triangle + kstar(k = 5) +
transitiveties +
edgecov('degree_relationship') +
nodematch('num_relationship') +
nodecov('num_relationship') +
absdiff('num_relationship') +
gwdegree(decay = 0.5,fixed = TRUE) +
gwesp(decay = 0.5, fixed = TRUE) +
memory(type = 'stability') +
timecov(transform = function(t) t) +
timecov(transform = function(t) t^2),
R = 10000, # Number of bootstrap replications
parallel = 'multicore',
ncpus = parallel::detectCores()-1,
usefastglm = TRUE,
verbose = FALSE)
summary(tergmFit1)
Estimate Boot mean 2.5% 97.5%
edges -8.6539e+00 -1.2565e+01 -12.3793 -7.5844
triangle -6.5887e-02 7.4380e-02 -0.2996 0.0942
kstar5 -1.2302e-03 -6.4653e-03 -0.0039 -0.0004
transitiveties -4.6934e-01 -6.0191e-01 -0.9528 1.1573
edgecov.degree_relationship -6.9295e+02 -2.9240e+04 -18103.3155 -342.7819
nodematch.num_relationship -5.7586e-01 -3.3125e+00 -1.3317 0.0902
nodecov.num_relationship 3.0900e-01 3.7542e-01 0.2453 0.4849
absdiff.num_relationship -1.1937e-01 -1.0292e-01 -0.2260 -0.0377
gwdeg.fixed.0.5 2.7852e+00 2.9804e+00 1.8024 3.9909
gwesp.fixed.0.5 2.0609e+00 1.6242e+00 0.6964 2.6881
edgecov.memory[[i]] 2.2013e-01 -1.6489e+00 0.0389 0.3673
edgecov.timecov1[[i]] -3.9973e-02 4.3376e-02 -0.1399 0.0642
edgecov.timecov2[[i]] 1.1145e-03 -6.2658e-04 -0.0013 0.0037
Iteratively remove non-significant covariates from the model until all remaining covariates are statistically significant.
set.seed(1)
tergmFit2 = btergm(formula = network_data ~ edges +
kstar(k = 5) +
edgecov('degree_relationship') +
nodecov('num_relationship') +
absdiff('num_relationship') +
gwdegree(decay = 0.5,fixed = TRUE) +
gwesp(decay = 0.5, fixed = TRUE) +
memory(type = 'stability'),
R = 10000, # Number of bootstrap replications
parallel = 'multicore',
ncpus = parallel::detectCores()-1,
usefastglm = TRUE,
verbose = FALSE)
summary(tergmFit2)
Estimate Boot mean 2.5% 97.5%
edges -8.6537e+00 -1.1854e+01 -11.3612 -7.7807
kstar5 -1.3649e-03 -6.2609e-03 -0.0039 -0.0006
edgecov.degree_relationship -6.8587e+02 -3.9589e+04 -17935.9694 -345.9128
nodecov.num_relationship 2.8482e-01 3.3775e-01 0.2003 0.4194
absdiff.num_relationship -7.1573e-02 -3.9644e-02 -0.1294 -0.0140
gwdeg.fixed.0.5 2.4119e+00 2.5264e+00 1.3556 3.3449
gwesp.fixed.0.5 1.6416e+00 1.6463e+00 1.2332 2.0369
edgecov.memory[[i]] 2.1827e-01 -2.0118e+00 0.0281 0.3610
Assess the goodness of fit of the two btergm models using the dyad-wise
shared partner distribution (dsp), the edge-wise shared partner
distribution (esp), the degree distribution (deg), and the
geodesic distance distribution (geodesic).
A model fits better the closer the medians of the box plots (computed from the simulated networks) lie to the line tracing the observed values of each statistic.
set.seed(1)
gof1 = gof(tergmFit1,
nsim = 100, # number of networks to be simulated at each time step
parallel = 'multicore',
ncpus = parallel::detectCores()-1,
statistics = c(dsp, esp, deg, geodesic),
verbose = FALSE)
plot(gof1)
set.seed(1)
gof2 = gof(tergmFit2,
nsim = 100,
parallel = 'multicore',
ncpus = parallel::detectCores()-1,
statistics = c(dsp, esp, deg, geodesic),
verbose = FALSE)
plot(gof2)
First, let \(t\) range over 31–39. For a fixed window size \(w\), fit a TERGM on the networks of periods \([t-w, t-1]\), predict the network of period \(t\), and compute out-of-sample metrics from the real and predicted networks of period \(t\); average these metrics over all \(t\). Repeat for each window size \(w\) from 5 to 30 to obtain the corresponding average out-of-sample metrics.
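The out-of-sample metrics are computed over the upper triangle of the adjacency matrices, so each unordered dyad is counted once. Writing \(TP\), \(FP\), \(FN\), and \(TN\) for the edge-level confusion counts,
\[
\text{precision} = \frac{TP}{TP+FP}, \qquad \text{recall} = \frac{TP}{TP+FN}, \qquad F_1 = \frac{2\cdot\text{precision}\cdot\text{recall}}{\text{precision}+\text{recall}},
\]
and accuracy is the share of dyads classified correctly, \((TP+TN)/(TP+FP+FN+TN)\).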
window_size = tibble(w = 5:30,
accuracy = NA,
recall = NA,
precision = NA,
f1_score = NA)
for(j in 1:nrow(window_size)){
accuracy = c()
recall = c()
precision = c()
f1_score = c()
for(i in 31:39){
set.seed(1)
temp_tergmFit = btergm(formula = network_data[(i-window_size$w[j]):(i-1)] ~ edges +
kstar(k = 5) +
edgecov('degree_relationship') +
nodecov('num_relationship') +
absdiff('num_relationship') +
gwdegree(decay = 0.5,fixed = TRUE) +
gwesp(decay = 0.5, fixed = TRUE) +
memory(type = 'stability'),
R = 10000, # Number of bootstrap replications
parallel = 'multicore',
ncpus = parallel::detectCores()-1,
usefastglm = TRUE,
verbose = FALSE)
# Predict the network of period t
temp_prediction = simulate(object = temp_tergmFit,
                           nsim = 10000, # number of networks to simulate
                           seed = 1)
# Transform the predicted networks into matrices
pre_network = map(.x = temp_prediction,
                  .f = as.matrix)
# Real network of period t
real_network = as.matrix(network_data[[i]])
# Calculate accuracy, precision, recall, and F1 score for edge presence in the network of period t
metrics = map(.x = pre_network,
.f = function(.x){
# Upper Triangular Part of Real Network
upper_tri_mat_real = real_network[upper.tri(real_network)]
# Upper Triangular Part of Predicted Network
upper_tri_mat_pre = .x[upper.tri(.x)]
# Calculate metrics
accuracy = mean(upper_tri_mat_real==upper_tri_mat_pre)
recall = sum(upper_tri_mat_real==1 & upper_tri_mat_pre==1)/sum(upper_tri_mat_real==1)
precision = sum(upper_tri_mat_real==1 & upper_tri_mat_pre==1)/sum(upper_tri_mat_pre==1)
f_score = 2*precision*recall/(precision+recall)
return(tibble(accuracy = accuracy,
recall = recall,
precision = precision,
f_score = f_score))
})
# Average metrics
avg_metric = metrics |>
list_rbind() |>
summarise_all(.funs = mean)
accuracy = c(accuracy,avg_metric$accuracy)
recall = c(recall,avg_metric$recall)
precision = c(precision,avg_metric$precision)
f1_score = c(f1_score,avg_metric$f_score)
}
window_size$accuracy[j] = mean(accuracy)
window_size$recall[j] = mean(recall)
window_size$precision[j] = mean(precision)
window_size$f1_score[j] = mean(f1_score)
}
window_size |>
  pivot_longer(cols = -w) |>
ggplot(aes(x = w, y = value)) +
geom_point() +
geom_line() +
facet_wrap(~name,scales = 'free') +
labs(x = 'Window size of training network data',
y = 'Average out-of-sample metric values')
According to the average out-of-sample F1 score, the more training networks used to fit the TERGM, the higher the F1 score for predicting whether a relationship exists between two currencies. Therefore, when predicting the network of period \(t\), all \(t-1\) previous networks should be used for fitting to ensure a high F1 score.
Fit a TERGM on the networks of the previous \(t-1\) periods, predict the network of period \(t\), and calculate accuracy, precision, recall, and F1 score from the real and predicted networks of period \(t\). Here \(t\) ranges over 31–39.
for(i in 31:39){
set.seed(1)
temp_tergmFit = btergm(formula = network_data[1:(i-1)] ~ edges +
kstar(k = 5) +
edgecov('degree_relationship') +
nodecov('num_relationship') +
absdiff('num_relationship') +
gwdegree(decay = 0.5,fixed = TRUE) +
gwesp(decay = 0.5, fixed = TRUE) +
memory(type = 'stability'),
R = 10000, # Number of bootstrap replications
parallel = 'multicore',
ncpus = parallel::detectCores()-1,
usefastglm = TRUE,
verbose = FALSE)
# Predict the network of period t
temp_prediction = simulate(object = temp_tergmFit,
                           nsim = 10000, # number of networks to simulate
                           seed = 1)
# Transform the predicted networks into matrices
pre_network = map(.x = temp_prediction,
                  .f = as.matrix)
# Real network of period t
real_network = as.matrix(network_data[[i]])
# Calculate accuracy, precision, recall, and F1 score for edge presence in the network of period t
metrics = map(.x = pre_network,
.f = function(.x){
# Upper Triangular Part of Real Network
upper_tri_mat_real = real_network[upper.tri(real_network)]
# Upper Triangular Part of Predicted Network
upper_tri_mat_pre = .x[upper.tri(.x)]
# Calculate metrics
accuracy = mean(upper_tri_mat_real==upper_tri_mat_pre)
recall = sum(upper_tri_mat_real==1 & upper_tri_mat_pre==1)/sum(upper_tri_mat_real==1)
precision = sum(upper_tri_mat_real==1 & upper_tri_mat_pre==1)/sum(upper_tri_mat_pre==1)
f_score = 2*precision*recall/(precision+recall)
return(tibble(accuracy = accuracy,
recall = recall,
precision = precision,
f_score = f_score))
})
# Average metrics
avg_metric = metrics |>
list_rbind() |>
summarise_all(.funs = mean)
# Print metrics
cat(str_c('Accuracy for the ',i,'-th period: ',round(avg_metric$accuracy,3),'\n'))
cat(str_c('Precision for the ',i,'-th period: ',round(avg_metric$precision,3),'\n'))
cat(str_c('Recall for the ',i,'-th period: ',round(avg_metric$recall,3),'\n'))
cat(str_c('F1 score for the ',i,'-th period: ',round(avg_metric$f_score,3),'\n'))
}
Accuracy for the 31-th period: 0.791
Precision for the 31-th period: 0.46
Recall for the 31-th period: 0.361
F1 score for the 31-th period: 0.404
Accuracy for the 32-th period: 0.781
Precision for the 32-th period: 0.278
Recall for the 32-th period: 0.308
F1 score for the 32-th period: 0.292
Accuracy for the 33-th period: 0.816
Precision for the 33-th period: 0.763
Recall for the 33-th period: 0.449
F1 score for the 33-th period: 0.565
Accuracy for the 34-th period: 0.759
Precision for the 34-th period: 0.368
Recall for the 34-th period: 0.579
F1 score for the 34-th period: 0.449
Accuracy for the 35-th period: 0.822
Precision for the 35-th period: 0.409
Recall for the 35-th period: 0.484
F1 score for the 35-th period: 0.443
Accuracy for the 36-th period: 0.742
Precision for the 36-th period: 0.414
Recall for the 36-th period: 0.25
F1 score for the 36-th period: 0.311
Accuracy for the 37-th period: 0.727
Precision for the 37-th period: 0.37
Recall for the 37-th period: 0.428
F1 score for the 37-th period: 0.397
Accuracy for the 38-th period: 0.787
Precision for the 38-th period: 0.372
Recall for the 38-th period: 0.52
F1 score for the 38-th period: 0.434
Accuracy for the 39-th period: 0.85
Precision for the 39-th period: 0.226
Recall for the 39-th period: 0.328
F1 score for the 39-th period: 0.267
Forecast the next network using the second btergm model, which was fitted on all network data.
prediction = simulate(object = tergmFit2,
                      nsim = 10000, # number of networks to simulate
                      seed = 1)
Transform the prediction results into matrices.
prediction_matrix = map(.x = prediction,
.f = as.matrix.network)
Calculate the proportion of simulated networks in which each edge appears.
final_prediction_matrix = reduce(.x = prediction_matrix,
                                 .f = `+`)/length(prediction)
# Zero out edges that appear in fewer than 1% of the simulated networks
final_prediction = ifelse(final_prediction_matrix<0.01,0,final_prediction_matrix)
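Each entry of final_prediction_matrix is the share of the 10,000 simulated networks that contain the corresponding edge, so a quick sanity check (illustrative) is that every entry lies in \([0,1]\):
# Sanity check: simulated edge proportions must lie in [0, 1]
range(final_prediction_matrix)
# Share of dyads kept after the 1% threshold
mean(final_prediction[upper.tri(final_prediction)] > 0)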
Define a function for plotting networks.
plot_history_network = function(adjacency_matrix, title, layout = layout_with_fr, size_param = 10){
  library(igraph)
  # Symmetrize the matrix (copy the upper triangle into the lower triangle)
  adjacency_matrix[lower.tri(adjacency_matrix)] = t(adjacency_matrix)[lower.tri(adjacency_matrix)]
  graph = graph_from_adjacency_matrix(abs(adjacency_matrix),
                                      mode = "undirected",
                                      weighted = TRUE)
  V(graph)$name = colnames(adjacency_matrix)
  V(graph)$label = V(graph)$name
  # Vertex size scales with log-degree; edge width scales with weight
  deg = degree(graph, mode = "all")
  V(graph)$size = log(deg + 2) * size_param
  E(graph)$width = E(graph)$weight * 80
  set.seed(1)
  result = plot(graph,
                layout = layout,
                vertex.color = "lightblue",
                edge.label.cex = 0.8,
                main = title)
  return(result)
}
# Historical networks
map2(.x = relation_data,
.y = names(network_data),
.f = ~plot_history_network(adjacency_matrix = .x,title = .y))
[39 historical network plots, one for each Year-Quarter combination]
# 2015-2
plot_history_network(adjacency_matrix = relation_data[[which(names(network_data)=='2015-2')]],
title = '2015-2',
layout = layout_with_gem,
size_param = 5)
# Function for visualizing the predicted network
plot_future_network = function(adjacency_matrix,
                               title,
                               size_param = 10,
                               width_param = 80){
  library(igraph)
  # Symmetrize the matrix (copy the upper triangle into the lower triangle)
  adjacency_matrix[lower.tri(adjacency_matrix)] = t(adjacency_matrix)[lower.tri(adjacency_matrix)]
  graph = graph_from_adjacency_matrix(abs(adjacency_matrix),
                                      mode = "undirected",
                                      weighted = TRUE)
  V(graph)$name = colnames(adjacency_matrix)
  V(graph)$label = V(graph)$name
  # Vertex size scales with log-degree; edge width scales with weight
  deg = degree(graph, mode = "all")
  V(graph)$size = log(deg + 2) * size_param
  E(graph)$width = E(graph)$weight * width_param
  set.seed(1)
  result = plot(graph,
                layout = layout_with_fr,
                vertex.color = "lightblue",
                edge.label.cex = 0.8,
                main = title)
  return(result)
}
# Visualize predicted network
plot_future_network(adjacency_matrix = final_prediction,
title = '2024-2',
size_param = 8,
width_param = 5)
# Visualize predicted network keeping only edges present in every simulation (proportion = 1)
new_final_prediction = ifelse(final_prediction==1,1,0)
plot_future_network(adjacency_matrix = new_final_prediction,
title = '2024-2',
size_param = 11,
width_param = 5)